*___________________________________________________________________________________________________________________________________________________________________
*
**# MERGING THE FILES
*___________________________________________________________________________________________________________________________________________________________________

*unicode encoding list			// to see different possible unicode encodings

// Saving Stata files from each of the .csv files
// Every "*.csv" file in the wave 3 folder is imported (semicolon-delimited, first row holds the
// variable names) and saved alongside as a .dta named after the .csv minus "_Anonymisation.csv".
// NOTE (original author): the import ran without error at first but failed afterwards - thought to
// be because Stata files were being imported as .csvs by mistake. Reassurance also from reading e.g.
// https://www.statalist.org/forums/forum/general-stata-discussion/general/1623219-binary-zeros-were-ignored-in-the-source-file
// (charset(utf8) was considered as an extra import option but is not used)
local csvfiles : dir "$EHIS\EHIS wave 3" files "*.csv", respectcase
foreach csv of local csvfiles {
	display `"`csv'"'
	import delimited using `"$EHIS\EHIS wave 3\\`csv'"', clear varnames(1) delimiters(";") bindquote(nobind)
	// region is sometimes numeric, and sometimes string (in DE) - to avoid append issues, easiest to drop it
	capture drop region
	// strip the fixed suffix so the saved file is named by what remains of the .csv name
	local stub : subinstr local csv "_Anonymisation.csv" "", all
	save "$EHIS\EHIS wave 3\\`stub'.dta", replace
}


// Combining these together
// Stacks every two-character .dta file ("??.dta") in the wave 3 folder into one dataset,
// renames age to agestr, drops the pa?/pe? variables and saves the result as 0_w3merged.dta.
// The per-country files are deleted only AFTER the merged file has been written, so a failed
// append (e.g. a string/numeric type clash) no longer leaves a half-deleted set of inputs.
local dtalist : dir "$EHIS\EHIS wave 3" files "??.dta"
if `"`dtalist'"' == ""	{
	// Without this guard the rename/drop/save below would run on whatever is still in memory
	display as error "no country .dta files (??.dta) found in the EHIS wave 3 folder"
	exit 601
/**/					}
local first = 1
foreach dta in `dtalist'	{
	dis `"`dta'"'
	if `first'					{
		// first file becomes the master dataset
		use `"${EHIS}\EHIS wave 3\\`dta'"', replace
		local first = 0
	/**/						}
	else						{
		append using `"${EHIS}\EHIS wave 3\\`dta'"'
	/**/						}
/**/					}
rename age agestr
drop pa? 								// in varied formats, so hard to merge with UK data
drop pe?
save "${EHIS}\EHIS wave 3\0_w3merged.dta", replace
* Merged file is on disk - now safe to remove the per-country files
foreach dta in `dtalist'	{
	erase `"${EHIS}\EHIS wave 3\\`dta'"'
/**/					}
/*
use "${EHIS}\EHIS wave 3\0_w3merged.dta", replace
*/

// Merging in UK data (which is properly labelled, as supplied in Stata)
// The UK file is loaded first so that it is the master dataset for the append.
use "${UK_EHIS}\ehis_wave_3_uk.dta", replace
* lower-case all variable names before appending the merged Eurostat file
rename *, lower
* same pa?/pe? variables that are dropped from the merged Eurostat file
drop pa? pe?
append using "${EHIS}\EHIS wave 3\0_w3merged.dta"
* in UK data categories 3 & 4 are combined for pl2, so need to copy label from another var
label values pl2 PL4
save "${EHIS}\EHIS wave 3\0_w3raw ${versno}.dta", replace


*___________________________________________________________________________________________________________________________________________________________________
*
**# CLEANING
*___________________________________________________________________________________________________________________________________________________________________
use "${EHIS}\EHIS wave 3\0_w3raw ${versno}.dta", replace


// Things that are different in UK and Eurostat data
* Prefix every existing value label with its numeric code ("#_")
numlabel _all, add mask(#_)

* Reference date: for non-UK rows, refyear/refmonth are taken from characters 1-4 and 5-6 of refdate
tostring refdate, replace
gen _refyear 	= substr(refdate,1,4) 	if country!="UK"
gen _refmth 	= substr(refdate,5,2) 	if country!="UK"
destring _refyear _refmth, replace
replace refyear  = _refyear 				if country!="UK"
replace refmonth = _refmth  				if country!="UK"
drop _refyear _refmth refdate

* BMI
order bm1 bm2, after(bmi)
label var bm1		"Height without shoes in cm (not UK)"
label var bm2		"Weight without clothes in kg (not UK)"

* Age
* Negative codes flag age bands that are not consistent across countries
label define AGE ///
	 -1 "-1_inconsistent:16-17"  -2 "-2_inconsistent:15-17"  -3 "-3_inconsistent:15-19"  -4 "-4_inconsistent:18-19" ///
	-11 "-11_inconsistent:75+"  -12 "-12_inconsistent:75_79" -13 "-13_inconsistent:80+"  -14 "-14_inconsistent:80-84" ///
	-15 "-15_inconsistent:85+", modify
* Existing numeric age codes: move the youngest/oldest bands onto the negative codes
recode age (1=-1)(2=-4)(14=-12)(15=-14)(16=-15)
* Rows carrying age as text (agestr): the five-year bands 20-24 ... 70-74 map onto codes 3 ... 13
local code = 3
foreach band in "20-24" "25-29" "30-34" "35-39" "40-44" "45-49" "50-54" "55-59" "60-64" "65-69" "70-74"	{
	replace age = `code'	if agestr=="`band'"
	local ++code
/**/					}
* ... and the irregular bands map onto the negative codes
replace age = -2 	if agestr=="15-17"
replace age = -3 	if agestr=="15-19"
replace age = -4 	if agestr=="18-19"
replace age = -11	if agestr=="75+"
replace age = -12	if agestr=="75-79"
replace age = -13	if agestr=="80+"
replace age = -14	if agestr=="80-84"
replace age = -15	if agestr=="85+"
drop agestr
* tab country if inrange(age,3,12)		// to check sample size by country
* Analysis sample: age codes 3-12 only
keep if inrange(age,3,12)


// General survey things
* Setting proxy responses to missing: every variable from hatlevel to ic3 becomes .p unless proxy==1
foreach v of varlist hatlevel-ic3	{
	replace `v' = .p if proxy!=1
/**/								}

* Country as categorical (not string) var
encode country, gen(countrynum)
order countrynum, after(country)

* Other vars
*   -1 -> 50: this is Portugal - from w3 Quality Report p24, which says this is f2f and online - but limited detail here
*   40 -> 12: this is Denmark - from w3 Quality Report p23 & p25, says this is self-completion online
recode intmethod (-1=50) (40=12)
* ES is actually 2019-20 (see Quality Report p28)
replace refyear = 2019 if country=="ES"


// Mental health scale (PHQ-8)
// Builds, from the eight mh1a-mh1h items: PHQmiss (number of missing items), PHQscore (sum of items
// rescored 0-3), PHQgroup (5-band classification), PHQgroup1-5 (dummies) and PHQgroupB (binary).
// Note: 291 ppl are labelled 'proxy' in IT, contradicting proxy==1 - given the absence of a refusal category in IT, I've assumed these should be refusals (see also pn1 - 243 of these 291 people overlap - and hs1)
unab phq: mh*
foreach var in `phq'	{
	* rescore 1-4 to 0-3; -3 and -1 become refusals (.r)
	recode   `var' (-3 -1=.r)(1=0 "0_Not at all")(2=1 "1_Several days")(3=2 "2_More than half the days")(4=3 "3_Nearly every day"), gen(_`var')
	replace _`var' = .n if country=="ES"			// removed due to confidentiality reasons, at ES request
/**/					}
*tab country mh1a, m row nof
egen PHQmiss 	=  rowmiss(_mh1a-_mh1h)
* 'missing' option: when ALL eight items are missing the score is missing, not 0
* (plain rowtotal() would give such respondents a score of 0, indistinguishable from 'no symptoms')
egen PHQscore 	= rowtotal(_mh1a-_mh1h), missing
recode PHQscore (0/4 = 1 "1_Minimal depression")(5/9=2 "2_Mild depression")(10/14=3 "3_Moderate depression")(15/19=4 "4_Moderately severe depression") ///
	/**/	(20/24=5 "5_Severe depression"), gen(PHQgroup)
* classification is only kept when at most one item is missing
replace PHQgroup = .r if inrange(PHQmiss,2,9)
label var PHQmiss			"dv Number of missing items in PHQ scale"
label var PHQscore			"dv Total PHQ score (0-24), inc. partial responses"
label var PHQgroup			"dv PHQ classification, inc. partial responses"
* Individual binary vars for this
tab PHQgroup, gen(PHQgroup)
* temporarily strip the "#_" prefix so the category text can be reused in the variable labels
numlabel PHQgroup, mask(#_) remove
forvalues i = 1/5	{
		local thislab: label PHQgroup `i'
		local thislab = subinstr("`thislab'", " depression", "", .)
		label var PHQgroup`i'	"dv PHQ #`i': `thislab' (inc. partial responses)"
/**/				}
recode PHQgroup (1/2=0 "0_Minimal/mild depression")(3/5=1 "1_Moderate/severe depression"), gen(PHQgroupB)
	label var PHQgroupB		"dv PHQ moderate/severe (inc. partial responses)"
numlabel PHQgroup, mask(#_) add
* Final tidying
* prefix the original item labels with "PHQ: ", place the derived vars after mh1h, drop the temporary rescored items
unab PHQvars: mh1? 
foreach var in `PHQvars'	{
	local thislab: var lab `var'
	label var `var' 	"PHQ: `thislab'"
/**/						}
order PHQmiss-PHQgroup5 PHQgroupB, after(mh1h)
drop _mh*


// Physical limitations
// 		NOTE: do this differently at w2, when pl1 and pl3 have =3 options (which don't seem to be used at w3)
// Two targeted corrections are applied first; afterwards -1 and -2 become .r on every pl1-pl7 item.

** Vision **
*tab country pl2 if !inlist(pl1,1,2), m		// routing is consistent, despite UK documentation saying that routing was wrong in UK if DK/ref at pl1
* oddly a few people have pl2==4 in FI, but nearly all missing
replace pl2 = .n if country=="FI"

** Hearing **
*tab country pl4 if !inlist(pl3,1,2), m		// routing is consistent, despite UK documentation saying that routing was wrong in UK if DK/ref at pl1
* these are people that weren't asked the follow-up hearing q because clear what they would say
replace pl5 = 4 if pl4==4 & pl5==-2
/* checking routing is consistent after adjustment in previous line
	egen hearingmiss = concat(pl3-pl5), punct(" | ")
	drop hearingmiss
*/

** Walking **
*tab pl7 if pl6==4, m						//  routing is consistent

** All items: -1 / -2 to refusal **
foreach item in pl1 pl2 pl3 pl4 pl5 pl6 pl7	{
	recode `item' (-1 -2=.r)
/**/										}


// Pain
// Note: 265 ppl are labelled 'proxy' in IT, contradicting proxy==1 - given the absence of a refusal category in IT, I've assumed these should be refusals (see also mh1a - 243 of these 265 people overlap)
*tab intmethod pn1 if inlist(pn2,-1,-3) & !inlist(pn1,-1,-3) & country=="FI", m
*tab intmethod pn1 if inlist(pn2,-1,-3) & !inlist(pn1,-1,-3) & country=="LU", m
* FI: most ppl didn't respond to pn2 if they said 'not at all' in pn1. In other countries ≈97% who said 'not at all' at pn1 said 'not at all' at pn2.
* LU: a less clear pattern than FI, but still seems to be the same issue
* => in both countries fill pn2 with 1 where pn1 is 1 and pn2 is -1
replace pn2 = 1 if pn1==1 & pn2==-1 & inlist(country,"FI","LU")
* remaining -1 / -3 codes become refusals
recode pn1 pn2 (-1 -3=.r)


// Condition vars
// Each cd1a-cd1p item gets a recoded copy cd1?R (-1 -> .r, 2 -> 0) labelled "<original label> recoded";
// the copies are placed after cd1p and the originals are then dropped.
foreach x in a b c d e f g h i j k l m n o p	{
	recode cd1`x' (-1=.r)(2=0), gen(cd1`x'R)
	local origlab: var label cd1`x'
	label var cd1`x'R "`origlab' recoded"
/**/											}
order cd1aR-cd1pR, after(cd1p)
drop cd1a-cd1p


// BMI
// Creates bmigroup: the supplied bmi value is kept as-is for UK and IT; for every other country
// bmi is banded into 1-5 using the cut-points 18.5 / 25 / 30 / 40. -1 is then set to .r, and IE to .i.
/*
version 14
local var "bm1"
table country if sex==1 & `var'>0, c(count pid count `var' min `var' max `var' )
table country if sex==2 & `var'>0, c(count pid count `var' min `var' max `var' )
local var "bm2"
table country if sex==1 & `var'>0, c(count pid count `var' min `var' max `var' )
table country if sex==2 & `var'>0, c(count pid count `var' min `var' max `var' )
*/
clonevar bmigroup = bmi
	label var bmi			"Do not use: BMI as supplied"
	label var bmigroup		"BMI group (based on supplied BMI)"
	replace bmigroup		 = . 		if !inlist(country,"UK", "IT")
	* Banding uses half-open intervals [lower, upper) with shared boundaries, so no value can fall
	* between two bands (ranges such as 0/18.49999 and 18.5/24.9999 leave small gaps in which a
	* continuous BMI would be copied through unbanded). Values not covered below are copied unchanged.
	gen double _bmi = bmi
	replace _bmi = -1 	if inlist(bmi,-3,-1)				// one '-3' value in NO, indicating another sort of problem (but unlikely to be proxy)
	replace _bmi = 1 	if bmi>=0    & bmi<18.5
	replace _bmi = 2 	if bmi>=18.5 & bmi<25
	replace _bmi = 3 	if bmi>=25   & bmi<30
	replace _bmi = 4 	if bmi>=30   & bmi<40
	replace _bmi = 5 	if bmi>=40   & !missing(bmi)		// missing values sort above all numbers, so exclude them explicitly
	replace bmigroup		 = _bmi 	if !inlist(country,"UK", "IT")
	recode bmigroup (-1=.r)
	replace bmigroup = .i if country=="IE"				// very strange distribution, as well as very high missingness, makes this seem very strange
/* This is my effort to create a consistent BMI given top/bottom coding - but this is impossible, because UK not comparable, and because top-coding BMI at 35.5 is ridiculous
gen bmigroupR = bmigroup if country=="UK"
	label var bmigroupR			"BMI revised group (dv, see notes)"
	* The revisions below comes from the w2 PDF document from Eurostat - the issue is really Italy does this (and to a lesser extent Ireland). But can't do it in the UK, where don't supply height and weight
	gen 	_bmiR =  bm2 / ((bm1/100)^2) if inrange(bm1,145,195) & inrange(bm2,45,130) & sex==1 	// men
	replace _bmiR =  bm2 / ((bm1/100)^2) if inrange(bm1,140,180) & inrange(bm2,40,120) & sex==2 	// women
	recode _bmiR (.=-1)(0/18.49999=1) (18.5/24.9999=2) (25/29.99999=3) (30/39.9999=4) (40/max=5), gen(_bmigroupR)
	replace _bmiR = .i if inrange(_bmiR,0,15.499999) | inrange(_bmiR,35.5000001, 200)			
	replace bmigroupR = _bmigroupR if country!="UK"
label values bmigroup bmigroupR BMI
*/
order bmigroup*, after(bmi)
drop _bmi*


// LLSI
// Two binary indicators derived from hs3 (after -1 has been set to refusal):
//   severellsiB = 1 when hs3 is 1, 0 when hs3 is 2 or 3
//   llsiB       = 1 when hs3 is 1 or 2, 0 when hs3 is 3
replace hs3 = .r if hs3==-1
recode hs3 (1=1 "1_severely limited") (2 3=0), gen(severellsiB)
	label var severellsiB 	"dv: LLSI severe limitations (from hs3)"
recode hs3 (1 2=1 "1_any limitations")(  3=0), gen(llsiB)
	label var llsiB 		"dv: LLSI any limitations (from hs3)"
order *llsiB, after(hs3)


// Employment status
* -1 becomes refusal
* 70=Compulsory military or civilian service, which is recoded into 'Other' in UK for anonymity reasons (and in several countries by the looks of it)
recode mainstat (-1=.r) (70=80)
* Binary indicator: mainstat 10 is working, 20-80 is not working
recode mainstat (10=1 "1_working") (20/80=0 "0_not working"), gen(workB)
label var workB 		"dv Working (binary var)"
order workB, after(mainstat)


// Education - just doesn't really look comparable


// Weights
// Labels the two supplied weights, turns the -1 code on wgt_spec into missing, and declares the
// survey design using wgt as the probability weight.
label var wgt				"Weight inc proxy interviews"
* coverage of wgt_spec is made clear in fn101 of 2020 methodological manual, and also UK dataset documentation section 6.2
label var wgt_spec			"Weight exc proxy interviews (CZ, EE, ES, IE, PL, RO, SE, UK only)"
mvdecode wgt_spec, mv(-1)
/* Checks
label define weightpattern 1 "1_both w8s" 2 "2_non-proxy but missing wgt_spec" 3 "3_non-proxy but missing wgt" 4 "4_missing both" 9 "9_proxy & missing wgt_spec"
gen weightpattern = 1 if !missing(wgt) & !missing(wgt_spec)
	replace weightpattern = 2 if  missing(wgt_spec) & !missing(wgt) 
	replace weightpattern = 3 if !missing(wgt_spec) &  missing(wgt) 
	replace weightpattern = 4 if  missing(wgt_spec) &  missing(wgt) 
label values weightpattern weightpattern 
bysort proxy: tab country weightpattern 
*/
svyset [pw=wgt]


// Combining rare categories
// PHQgroupR: PHQgroup with categories 4 and 5 merged.
// pl?R: pl2/pl4/pl5/pl6/pl7 with categories 3 and 4 merged (value label plR).
// pl6and7R: combined walking/steps difficulty built from pl6R and pl7R.
recode PHQgroup (1=1 "1_Minimal depression")(2=2 "2_Mild depression")(3=3 "3_Moderate depression")(4 5=4 "4_Moderately severe or severe depression"), gen(PHQgroupR)
	order PHQgroupR, after(PHQgroup)
label define plR 1 "1_No difficulty" 2 "2_Some difficulty" 3 "3_A lot of difficulty/cannot do at all"
	recode pl2 pl4 pl5 pl6 pl7 (3/4=3), gen(pl2R pl4R pl5R pl6R pl7R)
	* attach the 3-category label to the RECODED variables only - pl2R, not pl2
	* (labelling pl2 would overwrite the original 4-category variable's label and leave pl2R unlabelled)
	label values pl2R pl4R pl5R pl6R pl7R plR
	foreach i in 2 4 5 6 7	{
		local thislab: var label pl`i'
		label var pl`i'R "`thislab' recoded"
	/**/					}
label define pl6and7R 0 "0_no difficulties" 1 "1_some difficulty with either walking 1/2km or steps" 2 "2_some difficulty with both" 3 "3_a lot of difficulty with at least one", replace
	* later replaces override earlier ones, so the highest applicable category wins
	gen 	pl6and7R = 0 if pl6R==1 & pl7R==1
	replace pl6and7R = 1 if inlist(2,pl6R,pl7R)
	replace pl6and7R = 2 if pl6R==2 & pl7R==2
	replace pl6and7R = 3 if inlist(3,pl6R,pl7R)
	label values pl6and7R pl6and7R 
	label var pl6and7R "Difficulty in walking on level ground and up steps combined (from pl6 and pl7)"
order pl2R pl4R pl5R pl6R pl7R pl6and7R, after(pl7)


// Final tidying
// Drops variables not carried into the cleaned file, compresses storage types, saves the cleaned
// dataset and removes the two intermediate files.
drop birthplace*
drop cd2-aw2 ho12-un2d dh1-ic3
compress
* Save FIRST: the intermediates are only deleted once the cleaned file is on disk, so a failed
* save (disk full, file locked, bad path) cannot leave the folder with neither input nor output
save 	"${EHIS}\EHIS wave 3\0_w3cleaned ${versno}.dta", replace
erase 	"${EHIS}\EHIS wave 3\0_w3raw ${versno}.dta"	
erase 	"${EHIS}\EHIS wave 3\0_w3merged.dta"
/*
	use "${EHIS}\EHIS wave 3\0_w3cleaned ${versno}.dta", replace
*/
